library(GEOquery)
library(crayon)
library(tidyverse)
library(Seurat)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(Matrix)
library(RCurl)
library(DoubletFinder)
library(scales)
library(SoupX)
library(cowplot)
library(metap)
library(SingleCellExperiment)
library(DropletUtils)
library(AnnotationHub)
library(HGNChelper)
library(ensembldb)
library(multtest)
library(glmGamPoi)
library(pbapply)
library(data.table)
library(ggrepel)
library(ggpubr)
library(stringr)
library(canvasXpress)
library(clinfun)
library(GGally)
library(factoextra)
library(DESeq2)
library(limma)
library(fgsea)
library(org.Mm.eg.db)
library(kableExtra)

Step 1: Data Exploration and Import

Step 1.1: Read in all sample matrix and create Seurat object

# Specify path to supplementary files. 
supp_file_path = "~/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE264162/"

# Get unique sample names.
sample_names = list.files(path = paste0(supp_file_path, "raw_data"), pattern = "GSM", full.names = FALSE)
sample_names
##  [1] "GSM8213185_LG_AA_S1_1" "GSM8213185_LG_AA_S1_2" "GSM8213186_LG_AA_S2_1"
##  [4] "GSM8213186_LG_AA_S2_2" "GSM8213187_LG_AA_S3_1" "GSM8213187_LG_AA_S3_2"
##  [7] "GSM8213188_LG_AA_S4_1" "GSM8213188_LG_AA_S4_2" "GSM8213189_LG_AA_S5_1"
## [10] "GSM8213189_LG_AA_S5_2" "GSM8213190_LG_AA_S6_1" "GSM8213190_LG_AA_S6_2"
## [13] "GSM8213191_LG_AA_S7_1" "GSM8213191_LG_AA_S7_2"
# Create Seurat object for each sample.
for (file in sample_names){
  seurat_data <- Read10X(data.dir = paste0(supp_file_path, "raw_data/", file))
  seurat_obj <- CreateSeuratObject(counts = seurat_data,
                                   min.features = 100,
                                   project = file)
  assign(file, seurat_obj)
}

# Remove unneeded objects from the environment.
rm(file, seurat_data, seurat_obj)

Step 1.2: Merge Seurat objects

# Merge Seurat objects.
merged_seurat <- merge(x = GSM8213185_LG_AA_S1_1,
                       y = c(GSM8213185_LG_AA_S1_2,
                             GSM8213186_LG_AA_S2_1,
                             GSM8213186_LG_AA_S2_2,
                             GSM8213187_LG_AA_S3_1,
                             GSM8213187_LG_AA_S3_2,
                             GSM8213188_LG_AA_S4_1,
                             GSM8213188_LG_AA_S4_2,
                             GSM8213189_LG_AA_S5_1,
                             GSM8213189_LG_AA_S5_2,
                             GSM8213190_LG_AA_S6_1,
                             GSM8213190_LG_AA_S6_2,
                             GSM8213191_LG_AA_S7_1,
                             GSM8213191_LG_AA_S7_2),
                       add.cell.id = c("Young_Sham_1_rep1",
                                       "Young_Sham_1_rep2",
                                       "Young_Sham_2_rep1",
                                       "Young_Sham_2_rep2",
                                       "Young_Sham_3_rep1",
                                       "Young_Sham_3_rep2",
                                       "Bleo_14_days_1_rep1",
                                       "Bleo_14_days_1_rep2",
                                       "Bleo_14_days_2_rep1",
                                       "Bleo_14_days_2_rep2",
                                       "Bleo_35_days_1_rep1",
                                       "Bleo_35_days_1_rep2",
                                       "Bleo_35_days_2_rep1",
                                       "Bleo_35_days_2_rep2"))

# Concatenate the count matrices of the samples together.
merged_seurat <- JoinLayers(merged_seurat)

# Remove individual Seurat objects after successful merge.
rm(list = ls(pattern = "GSM"))
gc()
##             used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells  15829460  845.4   25595414  1367.0         NA   25595414  1367.0
## Vcells 333236570 2542.4 1571576192 11990.2    1024000 1894557522 14454.4

Step 1.3: Read metadata

# Pull GEO metadata.
GSE264162_meta <- getGEO(GEO = "GSE264162", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE264162_series_matrix.txt.gz
GSE264162_meta <- pData(GSE264162_meta$GSE264162_series_matrix.txt.gz)

# Subset metadata.
GSE264162_meta <- GSE264162_meta %>% subset(select = c(title, 
                                                       geo_accession,
                                                       organism_ch1,
                                                       characteristics_ch1.5,
                                                       `age:ch1`,
                                                       `cell type:ch1`,
                                                       `genotype:ch1`,
                                                       `strain:ch1`,
                                                       `tissue:ch1`))

# Rename columns to more accessible names.
colnames(GSE264162_meta) <- c("title",
                              "geo_accession",
                              "organism",
                              "treatment",
                              "age",
                              "cell_type",
                              "genotype",
                              "strain",
                              "tissue")

# Edit the metadata columns.
GSE264162_meta$title <- gsub(" ", "_", GSE264162_meta$title)
GSE264162_meta$title <- gsub("-", "_", GSE264162_meta$title)
GSE264162_meta$treatment <- gsub("treatment: ", "", GSE264162_meta$treatment)

# Add timepoint variable.
GSE264162_meta$timepoint <- ifelse(GSE264162_meta$treatment == "Sham", "Sham", "")
GSE264162_meta$timepoint <- ifelse(GSE264162_meta$timepoint == "", paste0("Day ", 
                                                                          gsub("Bleo_", "",
                                                                               gsub("_days_1", "",
                                                                                    gsub("_days_2", "",
                                                                                         GSE264162_meta$title)))), 
                                   GSE264162_meta$timepoint)

Step 1.4: Add metadata variables from GEO

We can merged the metadata from GEO and put the merged metadata back into the Seurat object.

# Get metadata from Seurat object.
metadata = merged_seurat@meta.data %>% rownames_to_column()
dim(metadata)
## [1] 86347     4
# Create a variable column to merge dataframes.
metadata$geo_accession <- str_extract(metadata$orig.ident, "[^_]+")

# Merge metadata with GEO metadata by the newly created geo_accession column.
metadata <- merge(metadata, GSE264162_meta, by = "geo_accession")
dim(metadata)
## [1] 86347    14
# Edit and retain only required columns.
metadata <- metadata %>% column_to_rownames() %>% subset(select = -geo_accession)

# Generate quality metrics.
names(metadata)[names(metadata) == "nCount_RNA"] <- "nUMI"
names(metadata)[names(metadata) == "nFeature_RNA"] <- "nGene"

# Compute mitochondrial ratio.
metadata$mitoRatio = PercentageFeatureSet(object = merged_seurat, pattern = "^mt-")
metadata$mitoRatio = metadata$mitoRatio / 100

# Calculate novelty score.
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)

# Add the new metadata back to the Seurat object. 
merged_seurat@meta.data <- metadata

# Remove unneeded objects.
rm(GSE264162_meta, metadata, sample_names)

Step 2: Quality Control

Step 2.1: Visualize cell counts

Check cell counts with number of unique barcodes detected to determine capture efficiency.

# Extract metadata from Seurat object.
metadata = merged_seurat@meta.data

# Visualize cell counts.
metadata %>%
  ggplot(aes(x = treatment, fill = treatment)) +
  geom_bar() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("NCells")

Step 2.2: Visualize UMI counts per cell

The UMI counts per cell should generally be above 500, that is the low end of what we expect. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply.

# Visualize UMI counts per cell.
metadata %>% 
  ggplot(aes(x = nUMI, color = treatment, fill = treatment)) +
  geom_density(alpha = 0.2) +
  scale_x_log10(labels = label_number()) +
  ylab("Cell density") +
  geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
  theme_classic()

Step 2.3: Visualize novelty score of cells

Generally, we expect the novelty score to be above 0.80 for good quality cells.

# Visualize novelty score.
metadata %>% 
  ggplot(aes(x = log10GenesPerUMI, color = treatment, fill = treatment)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.8, linetype = "dashed", color = "red") +
  theme_classic()

Step 2.4: Visualize mitochondrial ratio

In general, poor quality samples have mitochondrial ratio higher than 0.2 mark.

# Visualize mitochondrial ratio.
metadata %>%
  ggplot(aes(x = mitoRatio, color = treatment, fill = treatment)) +
  geom_density(alpha = 0.2) +
  scale_x_log10() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  theme_classic()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 532 rows containing non-finite outside the scale range
## (`stat_density()`).

Step 2.5: Visualize joint filtering effects

Considering any of these QC metrics in isolation can lead to misinterpretation of cellular signals. Jointly visualizing the count and gene thresholds and additionally overlaying the mitochondrial fraction, gives a summarized persepective of the quality per cell.

# Visualize joint filtering QCs.
metadata %>% 
    ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + 
    geom_point() + 
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method = lm) +
    scale_x_log10() + 
    scale_y_log10() + 
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    theme_classic() +
    facet_wrap(~treatment)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Step 2.6: Visualize valid cells with knee plot

# Rank expression barcodes. 
br.out = barcodeRanks(merged_seurat@assays$RNA$counts)

# Plot knee plot.
plot(br.out$rank, br.out$total, log = "xy", xlab = "Rank", ylab = "Total")
abline(h = metadata(br.out)$knee, col = "dodgerblue", lty = 2)
abline(h = metadata(br.out)$inflection, col = "forestgreen", lty = 2)
legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"), legend = c("knee", "inflection"))

# Remove unneeded object.
rm(br.out)

Step 2.7: Visualize doublets

Before running DoubletFinder(), we need to run a quick Seurat processing pipeline first.

# Run quick Seurat processing pipeline.
set.seed(10)
seur_obj <- NormalizeData(merged_seurat)
seur_obj <- FindVariableFeatures(seur_obj)
seur_obj <- ScaleData(seur_obj)
seur_obj <- RunPCA(seur_obj)
seur_obj <- FindNeighbors(seur_obj)
seur_obj <- FindClusters(seur_obj)

Determine the threshold for the pANN score that corresponds to the expected number of heterotypic doublets (parameter: nEXP = expected number of heterotypic doublets).

# Calculate homotypic doublet proportion.
type.freq <- table(seur_obj@meta.data$seurat_clusters) / ncol(seur_obj)
homotypic.prop <- sum(type.freq^2)

# Calculate nEXP based on number of heterotypic doublet.
nEXP = 0.009 * (ncol(seur_obj)/1000) * (1-homotypic.prop) * ncol(seur_obj)
nEXP

# Define pN and nPC.
pN = 0.25
PCs = 25

# Determine bimodality coeffient (pK).
sweep.out <- paramSweep(seur_obj, PCs = 1:PCs, sct = FALSE)
sweep.stats <- summarizeSweep(sweep.out)

Summarize sweep output to get the ideal coefficient values.

# Summarize coefficient output.
plot(sweep.stats[, 2:3])
sweep.stats[as.numeric(as.character(sweep.stats[, 2])) < 0.075 & sweep.stats[, 3] > 0.8,]

# Clear memory.
gc()

# Remove unneeded objects.
rm(homotypic.prop, sweep.out, type.freq)

pK = 0.0005 yields consistently high scores with pN = 0.05 producing the highest score.

# Run DoubletFinder using coefficients from paramSweep.
pK = 0.0005
pN = 0.05
seur_obj <- doubletFinder(seur_obj, PCs = 1:PCs, pN = pN, pK = pK, nExp = nEXP)

# DoubletFinder adds the doublet score in the meta.data slot.
head(seur_obj@metadata)

# Grab the metadata and rename relevant columns.
doublet_meta = seur_obj@meta.data
names(doublet_meta)[grep("pANN", colnames(doublet_meta))] <- "pANN"
names(doublet_meta)[grep("DF.classifications", colnames(doublet_meta))] <- "doublet_class"

# Keep relevant columns only.
doublet_meta <- doublet_meta %>% 
  subset(select = c("orig.ident", "pANN", "doublet_class")) %>%
  rownames_to_column("matched_id")

# Get metadata from the Seurat object to merge.
metadata = merged_seurat@meta.data %>%
  rownames_to_column("matched_id")

# Check if matched_id for both dataframes match before merging.
identical(metadata$matched_id, doublet_meta$matched_id) # Must be TRUE.

# Merge both metadata together.
metadata <- left_join(metadata, doublet_meta, by = "matched_id") %>%
  subset(select = -orig.ident.y) %>%
  column_to_rownames("matched_id")

# Add merged metadata back to the Seurat object.
all(colnames(merged_seurat) == rownames(metadata)) # Must be TRUE.
merged_seurat@meta.data <- metadata

Step 2.8: Filtering

Step 2.8.1: Cell-level filtering

Now that we have visualized the various metrics, we can decide on the thresholds to apply which will result in the removal of low quality cells. We will use the following thresholds:

  • nUMI > 500

  • nGene > 250

  • log10GenesPerUMI > 0.8

  • mitoRatio < 0.2

# Get pre-filtering cell counts.
dim(merged_seurat)[2] # 86,347 cells
## [1] 86347
# Low-quality cell filtering.
umi_filtered <- subset(x = merged_seurat, subset = nUMI >= 500)
gene_filtered <- subset(x = umi_filtered, subset = nGene >= 250)
genesperumi_filtered <- subset(x = gene_filtered, subset = log10GenesPerUMI > 0.80)
mitoratio_filtered <- subset(x = genesperumi_filtered, subset = mitoRatio < 0.20)

# Get post-filtering Seurat object. 
# Enter level of filtering desired here.
cell_filtered_seurat = mitoratio_filtered

# Get counts after low-quality cell filtering.
dim(cell_filtered_seurat)[2] # 80,505 cells.
## [1] 80505

We can visualize the amount of cells removed and the number of cells remaining for each filtering step in the plot below.

Step 2.8.2: Gene-level filtering

We also remove genes that have zero expression in all cells and genes that are only expressed in 10 or less cells as genes that are expressed in only a handful of cells are not meaningful and can bring down the averages for all the other cells that they are not expressed in.

# Extract count matrix.
counts <- GetAssayData(object = cell_filtered_seurat, layer = "counts")

# Get a logical vector for every gene that has more than zero counts per cell.
nonzero <- counts > 0

# Get a logical vector of genes that are expressed in 10 or more cells.
keep_genes <- Matrix::rowSums(nonzero) >= 10

# Subset counts to keep genes that are non-zero and expressed in 10 or more cells.
filtered_counts <- counts[keep_genes, ]

# Create new Seurat object with the subsetted counts.
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = cell_filtered_seurat@meta.data)

We can visualize the amount of genes removed and the number of genes remaining for each filtering step in the plot below.

Step 3: Normalization

Step 3.1: Pre-normalization visualization

Let’s have a look of our expression distribution before normalization.

# Visualize expression distribution before normalization.
hist(colSums(filtered_seurat[["RNA"]]$counts), 
     breaks = 100, 
     col = "#212226",
     main = "Pre-Normalization: Expression distribution", 
     xlab = "Sum of expression")

Step 3.2: Simple normalization

Before we make any comparisons across cells, we will apply a simple normalization. This is solely for the purpose of exploring the sources of variation in our data.

# Apply simple normalization to merged Seurat objects.
seurat_phase <- NormalizeData(filtered_seurat)
## Normalizing layer: counts
# Visualize expression distribution after simple normalization.
hist(colSums(seurat_phase[["RNA"]]$data), 
     breaks = 100, 
     col = "#b8520a",
     main = "Post-Simple Normalization: Expression distribution", 
     xlab = "Sum of expression")

# Remove unneeded objects.
rm(filtered_seurat)

Step 3.3: Evaluate effects of cell cycle

Step 3.3.1: Set up cell cycle scores

# Download cell cycle genes for organism at https://github.com/hbc/tinyatlas/tree/master/cell_cycle. 
cc_file <- getURL("https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Mus_musculus.csv") 
cell_cycle_genes <- read.csv(text = cc_file)

# Remove unneeded objects.
rm(cc_file)

All of the cell cycle genes are Ensembl IDs, but our gene IDs are the gene names. To score the genes in our count matrix for cell cycle, we need to obtain the gene names for the cell cycle genes.

# Connect to AnnotationHub.
ah <- AnnotationHub()

# Access the Ensembl database for organism.
ahDb <- query(ah, 
              pattern = c("Mus musculus", "EnsDb"), 
              ignore.case = TRUE)

# Acquire the latest annotation files.
id <- ahDb %>%
        mcols() %>%
        rownames() %>%
        tail(n = 1)

# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, 
                     return.type = "data.frame")

# Select annotations of interest.
annotations <- annotations %>%
        dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)

# Remove unneeded objects.
rm(ah, ahDb, edb, id)

We can merge the Ensembl genes to the cell cycle genes.

# Get gene names for Ensembl IDs for each gene.
cell_cycle_markers <- dplyr::left_join(cell_cycle_genes, annotations, by = c("geneID" = "gene_id"))

# Acquire the S phase genes.
s_genes <- cell_cycle_markers %>%
        dplyr::filter(phase == "S") %>%
        pull("gene_name")
        
# Acquire the G2M phase genes.        
g2m_genes <- cell_cycle_markers %>%
        dplyr::filter(phase == "G2/M") %>%
        pull("gene_name")

# Remove unneeded objects.
rm(cell_cycle_genes, cell_cycle_markers, annotations)

We can then use the annotated cell cycle genes to perform cell cycle scoring.

# Perform cell cycle scoring
seurat_phase <- CellCycleScoring(seurat_phase,
                                 g2m.features = g2m_genes,
                                 s.features = s_genes)

# Remove unneeded objects.
rm(g2m_genes, s_genes)

Step 3.3.2: Use PCA to evaluate effects of cell cycle

# Identify the most variable genes.
seurat_phase <- FindVariableFeatures(seurat_phase, 
                                     selection.method = "vst",
                                     nfeatures = 2000, 
                                     verbose = FALSE)
             
# Scale the counts.
seurat_phase <- ScaleData(seurat_phase)
## Centering and scaling data matrix
# Visualize expression distribution after scaling.
hist(colSums(seurat_phase[["RNA"]]$scale.data), 
     breaks = 100, 
     col = "#779e7f",
     main = "Post-Scaling: Expression distribution", 
     xlab = "Sum of expression")

# Identify the 15 most highly variable genes.
ranked_variable_genes <- VariableFeatures(seurat_phase)
top_genes <- ranked_variable_genes[1:15]

# Plot the average expression and variance of these genes with labels to indicate which genes are in the top 15.
p <- VariableFeaturePlot(seurat_phase)
LabelPoints(plot = p, points = top_genes, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results

# Remove unneeded objects.
rm(ranked_variable_genes, top_genes, p)
# Perform PCA.
seurat_phase <- RunPCA(seurat_phase)
## PC_ 1 
## Positive:  Ccdc153, Tmem212, 1110017D15Rik, Fam183b, Rsph1, Dynlrb2, Foxj1, Cfap126, Cyp2s1, Arhgdig 
##     Sec14l3, 1700007K13Rik, Cd24a, Ccdc113, Mlf1, Gm19935, Riiad1, 1700016K19Rik, Aldh3b1, 1700094D03Rik 
##     Pifo, Tspan1, BC051019, Cfap53, Cdhr3, Mns1, Tctex1d4, Sntn, Ak7, Traf3ip1 
## Negative:  Sparc, Vim, Igfbp7, Col4a1, Bgn, Mgp, Apoe, Col6a1, Adamts1, Col3a1 
##     Col1a2, Fstl1, Sparcl1, Col1a1, Lgals1, Eln, Ltbp4, Igfbp4, Fbn1, Serpinh1 
##     Col5a2, Col4a2, Selenom, Cfh, Loxl1, Serpina3n, Ptgis, Lrp1, Emp3, Colec12 
## PC_ 2 
## Positive:  Sftpa1, Napsa, Cxcl15, Sftpd, Slc34a2, Lyz2, Lamp3, Cd74, Scd1, S100g 
##     Ager, Cldn18, Fabp5, H2-Aa, Chil1, Hc, Egfl6, Aqp5, Elovl1, Ctsh 
##     H2-Ab1, Bex4, Lrp2, Acoxl, Slc39a8, Chia1, Mal, Lcn2, Wfdc2, Bex2 
## Negative:  Col3a1, Col1a2, Serping1, Col1a1, Bgn, Mgp, Gsn, Loxl1, Lrp1, Lgals1 
##     Fbn1, Rbp1, Plxdc2, Mfap5, Mfap4, Pmepa1, Col5a2, Fn1, Mmp2, Fstl1 
##     Adamts2, Rarres2, Olfml3, Sod3, Sparcl1, Dpt, Col6a3, Colec12, Ptgis, Cfh 
## PC_ 3 
## Positive:  Sftpd, Napsa, Cxcl15, Chil1, Slc34a2, Scd1, Sftpa1, Lamp3, Lyz2, S100g 
##     Cldn18, Ager, Egfl6, Hc, Fabp5, Aqp5, Selenbp1, Mgst1, Elovl1, Ctsh 
##     Bex4, Wfdc2, Lrp2, Hp, Cd74, Col6a1, Acoxl, Prdx6, Apoc1, Il33 
## Negative:  Ramp2, Cdh5, Cldn5, Egfl7, Calcrl, Pecam1, Ctla2a, Ptprb, Aqp1, Ehd4 
##     Rasip1, Ecscr, Flt1, Eng, Icam2, Plvap, S1pr1, Srgn, Cd93, Ly6a 
##     Tspan7, Cyyr1, Podxl, Lyve1, Cavin2, Slfn5, Gimap6, Acvrl1, Cemip2, Clec14a 
## PC_ 4 
## Positive:  Inmt, Scn7a, Fmo2, Ogn, Ly6a, Tmem100, Pcolce2, Clec3b, Fmo1, Hpgd 
##     Dpep1, Plpp3, Mfap4, Acvrl1, C7, Mamdc2, Clec14a, Tspan7, Flt1, Sept4 
##     Tcf21, Atp1a2, Apoe, Ace, Iigp1, Gsn, Thbd, Hsd11b1, Gstm1, Abca8a 
## Negative:  Birc5, Mki67, Tpx2, Cdca8, Cdk1, Pclaf, Ccna2, Ube2c, Cdca3, Hmmr 
##     Top2a, Cenpf, Racgap1, Prc1, Cenpe, Anln, Ccnb1, Cenpa, Pbk, Plk1 
##     Ccnb2, Cdc20, Aurkb, Nusap1, Ckap2l, Knl1, Kif11, Knstrn, Cdkn3, Tacc3 
## PC_ 5 
## Positive:  Fabp5, Chil1, Lyz2, Slc34a2, Lamp3, Cxcl15, Sftpa1, S100g, Hc, Scd1 
##     Cbr2, Acoxl, Lcn2, Il33, Egfl6, Lrg1, Bex2, H2-Aa, Napsa, Apoc1 
##     Elovl1, Itih4, Lrp2, Hp, Sftpd, Chia1, Bex4, H2-Ab1, Trf, Aox3 
## Negative:  Rtkn2, Col4a3, Akap5, Spock2, Scnn1g, Col4a4, Igfbp2, Krt7, Itgb6, Hopx 
##     Ndnf, Abca5, Flrt3, Clic5, Tmem37, Ano1, 2210011C24Rik, Gprc5a, Lama3, Cdkn2b 
##     Cryab, Emp2, Lamc2, Hck, Pxdc1, Pdpn, Nt5e, Tspan8, Fam189a2, Scnn1b
# Plot the PCA colored by cell cycle phase.
DimPlot(seurat_phase,
        reduction = "pca",
        group.by = "Phase",
        split.by = "Phase")

# Split effects of PCA by treatment.
DimPlot(seurat_phase,
        reduction = "pca",
        group.by = "Phase",
        split.by = "timepoint")

We don’t see large differences due to the effects of cell cycle, so cell cycle effects are not regressed out.

Step 3.4: SCTransform normalization

# Free up memory.
gc()
##             used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells  16322719  871.8   25595414  1367.0         NA   25595414  1367.0
## Vcells 776961035 5927.8 1515293976 11560.8    1024000 1894557522 14454.4
# Set memory limit.
options(future.globals.maxSize = 8000 * 1024^2)

# Perform SCTransform normalization.  
seurat_sct <- SCTransform(seurat_phase, 
                          vars.to.regress = c("mitoRatio"),
                          vst.flavor = "v2",
                          conserve.memory = TRUE,
                          verbose = FALSE)
## Will not return corrected UMI because residual type is not set to 'pearson'
# Check which objects are stored in which assays. 
seurat_sct@assays
## $RNA
## Assay (v5) data with 22546 features for 80505 cells
## Top 10 variable features:
##  Bpifa1, Scgb3a1, Bpifb1, Ltf, Reg3g, Cxcl13, Spp1, Retnla, Tff2, Saa3 
## Layers:
##  counts, data, scale.data 
## 
## $SCT
## SCTAssay data with 22546 features for 80505 cells, and 1 SCTModel(s) 
## Top 10 variable features:
##  Sftpc, Scgb1a1, Lyz1, Spp1, Mgp, Ccl21a, Scgb3a2, Lyz2, Bpifa1, Scgb3a1
# Remove unneeded objects.
rm(seurat_phase)

The SCTransform normalized counts are stored in the SCT$counts slot. Here is the distribution of the SCTransform normalized counts distribution.

# Visualize expression distribution after scaling.
hist(colSums(seurat_sct[["SCT"]]$counts), 
     breaks = 100, 
     col = "#332f42",
     main = "Post-SCTransform: Expression distribution", 
     xlab = "Sum of expression")

The log-normalized SCTransfromed counts are also stored in the SCT$data slot. Here is the distribution of the log-normalized SCTransformed counts.

hist(colSums(seurat_sct[["SCT"]]$data), 
     breaks = 100, 
     col = "#b14b34",
     main = "Post-SCTransformed: Log-normalized expression distribution", 
     xlab = "Sum of expression")

Step 4: Dimensional Reduction and Clustering

The goal of this step is to perform unsupervised clustering to identify groups of cells based on similarities of their transcriptomes without any prior knowledge of the labels or the number of clusters. This can be challenging due to the high level of noise and large number of dimensions, so it is often beneficial to apply a dimensionality reduction method to reduce the amount of noise.

Step 4.1: Run PCA

# Run PCA.
seurat_sct <- RunPCA(seurat_sct, verbose = FALSE)

# Visualize dimension reduction from PCA.
seurat_sct <- RunUMAP(seurat_sct, dims = 1:40, reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:38:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:38:05 Read 80505 rows and found 40 numeric columns
## 11:38:05 Using Annoy for neighbor search, n_neighbors = 30
## 11:38:05 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:38:12 Writing NN index file to temp file /var/folders/1c/sbnd3yvd7mzg5lng1jcr70tw0000gn/T//RtmpG4X7bv/fileaad2c52b519
## 11:38:12 Searching Annoy index using 1 thread, search_k = 3000
## 11:38:38 Annoy recall = 100%
## 11:38:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:38:43 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:38:48 Commencing optimization for 200 epochs, with 3408926 positive edges
## 11:39:21 Optimization finished
DimPlot(object = seurat_sct, group.by = "treatment")

Step 4.2: Top PC-driving metagenes

# Explore the top PC markers.
DimHeatmap(seurat_sct, 
           dims = 1:9, 
           cells = 500, 
           balanced = TRUE)

We can visualize an elbow plot to see how many PCs to use for clustering.

# Build the elbow plot.
# Determine percent of variation associated with each PC.
pct <- seurat_sct[["pca"]]@stdev / sum(seurat_sct[["pca"]]@stdev) * 100

# Calculate cumulative percents for each PC.
cumu <- cumsum(pct)

# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5.
co1 <- which(cumu > 90 & pct < 5)[1]
co1
## [1] 41
# Determine the difference between variation of PC and subsequent PC.
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1

# Last point where change of % of variation is more than 0.1%.
co2
## [1] 25
# Minimum of the two calculation.
pcs <- min(co1, co2)
pcs
## [1] 25
# Create a dataframe with values.
elbowplot_df <- data.frame(pct = pct, 
                           cumu = cumu, 
                           rank = 1:length(pct))

# Elbow plot to visualize.
ggplot(elbowplot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()

# Remove unneeded objects.
rm(co1, co2, cumu, elbowplot_df, pcs, pct)

Based on the elbow plot, we will use approx. 15 PCs to generate the clusters.

Step 4.3: Cluster the cells

The basic steps towards complete clustering involve:

  • Measure of similarity: How does one quantify how close two data points are?

  • Quality function: How does one define the clustering/partition of the points?

  • Algorithm: How to find the clustering whereby the quality function is optimized?

Step 4.3.1: Find neighbours

We use a graph-based community detection method where each vertex represents a cell and the weight of the edge measures similarities between two cells. Put simply, we start by building an unweighted K-nearest neighbour (k-NN) graph and add weights to obtain a shared nearest neighbour (SNN) graph. Weight can be added either by finding out the shared nodes (overlap) between two neighbours (more nodes; more similar the neighbours; more weight) or measuring the closeness of both neighbours to each other.

# Compute SNN graph.
seurat_cluster <- FindNeighbors(object = seurat_sct, dims = 1:15) # Based on elbow plot.
## Computing nearest neighbor graph
## Computing SNN
# Remove unneeded object.
rm(seurat_sct)

Step 4.3.2: Find clusters

Next, we want to iteratively group the neighbours together with the goal of combining the cells with similar transcriptomes together. Without knowing the number of clusters a priori, we use a range of resolution based on the approximation of the recommendation where datasets of 3,000 - 5,000 cells should set the resolution between 0.4 - 1.4. We have approximately 90,000 cells, so our range will start from 1.5 - 2.5.

# Determine start and end resolution range.
start_reso_range = 0.2
end_reso_range = 1.0
range_interval = 0.2

# Find clusters given the range.
seurat_cluster <- FindClusters(object = seurat_cluster,
                               resolution = seq(from = start_reso_range,
                                                to = end_reso_range,
                                                by = range_interval))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80505
## Number of edges: 2500763
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9649
## Number of communities: 13
## Elapsed time: 34 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80505
## Number of edges: 2500763
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9498
## Number of communities: 23
## Elapsed time: 27 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80505
## Number of edges: 2500763
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9379
## Number of communities: 26
## Elapsed time: 31 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80505
## Number of edges: 2500763
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9275
## Number of communities: 32
## Elapsed time: 32 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 80505
## Number of edges: 2500763
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9193
## Number of communities: 39
## Elapsed time: 28 seconds
# Remove unneeded objects.
rm(start_reso_range, end_reso_range, range_interval)

We can look at the maximum number of clusters found at each resolution with the output below.

# Create a dummy table to contain the max cluster for each resolution.
maxclust = c()
clustoutput = seurat_cluster@meta.data %>% subset(select = grep("SCT_snn_", colnames(.)))
clustoutput[] <- lapply(clustoutput, function(x) as.numeric(as.character(x)))

# Iterate and print the maximum cluster count for each resolution.
for (i in 1:ncol(clustoutput)){
  maxclust[i] = max(clustoutput[, i])
  print(paste0("Max n-cluster for ", 
               names(clustoutput[i]), ": ", 
               maxclust[i]))
}
## [1] "Max n-cluster for SCT_snn_res.0.2: 12"
## [1] "Max n-cluster for SCT_snn_res.0.4: 22"
## [1] "Max n-cluster for SCT_snn_res.0.6: 25"
## [1] "Max n-cluster for SCT_snn_res.0.8: 31"
## [1] "Max n-cluster for SCT_snn_res.1: 38"
# Remove unneeded objects.
rm(clustoutput, i, maxclust)

We can visualize the difference clusterings of the resolutions side by side. In this case, we will look at these resolutions: 0.6, 0.8, 1.0.

# Select resolutions to visualize.
resolutions = c("0.6", "0.8", "1")
feature_cols = paste0("SCT_snn_res.", resolutions)

# Visualize UMAP of different resolutions.
for (reso in feature_cols) {
  Idents(seurat_cluster) <- reso
  p <- DimPlot(seurat_cluster,
               reduction = "umap",
               label = TRUE,
               label.size = 3) + 
    ggtitle(reso) +
    NoLegend()
  print(p)
}

# Remove unneeded objects.
rm(feature_cols, p, reso, resolutions)

Step 4.4: Explore clustering output

Let’s start with the exploration with the highest resolution in the range, 1.0.

# Sort the cluster labels in ascending order.
reso_columns = grep("SCT_snn_", colnames(seurat_cluster@meta.data), value = TRUE)
metadata <- seurat_cluster@meta.data
for (col in colnames(metadata)) {
  if (col %in% reso_columns){
    metadata[, col] <- factor(metadata[, col], levels = paste(sort(as.integer(levels(metadata[, col])))))
  }
}
seurat_cluster@meta.data <- metadata

# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_cluster) <- "SCT_snn_res.1"

# Visualize cluster with a UMAP.
DimPlot(seurat_cluster,
        reduction = "umap",
        label = TRUE,
        label.size = 3,
        repel = TRUE) + 
  ggtitle("SCT_snn_res.1.0") +
  NoLegend()

# Remove unneeded objects.
rm(col, metadata, reso_columns)

Step 4.5: Explore clustering quality control

Step 4.5.1: Distribution of cells by treatment

# Extract number of cells per cluster.
n_cells <- FetchData(seurat_cluster,
                     vars = c("ident", "treatment")) %>%
  dplyr::count(ident, treatment) 

# Visualize distribution of cells by cluster in a barplot.
ggplot(n_cells, aes(x = ident, y = n, fill = treatment)) +
  geom_bar(position = position_dodge(), 
           stat = "identity") +
  geom_text(aes(label = n), 
            size = 2, 
            vjust = -0.2, 
            position = position_dodge(1)) +
  theme_classic() + 
  theme(axis.text.x = element_text(size = 5))

# Visualize distribution of treatment in each cluster.
DimPlot(seurat_cluster, 
        label = TRUE, 
        split.by = "treatment",
        label.size = 3,
        repel = TRUE) + 
  theme(axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 6)) +
  NoLegend()

# Remove unneeded object.
rm(n_cells)

Step 4.5.2: Mitochondrial enrichment in clusters

We would like to see if there is any enrichment of mitochondria in our clusters and if we need to regress out the mitochondrial level in our data.

# Plot mitoRatio to see if any cluster is enriched in mitochondrial level.
FeaturePlot(seurat_cluster, 
            reduction = "umap", 
            features = "mitoRatio",
            cols = c("#d6cfc7", "#01013f"),
            pt.size = 0.5, 
            order = TRUE,
            min.cutoff = "q10",
            label = TRUE,
            label.size = 3,
            label.color = "#960000") 

There are certain clusters enriched with mitochondria, but they are within the acceptable ratio of under 0.20.

Step 4.5.3: Variation in clusters due to cell cycle phase

If our cell clusters show large differences in cell cycle expression, we might want to regress out the effects of cell cycle phase.

# Explore whether clusters segregate by cell cycle phase.
DimPlot(seurat_cluster,
        label = TRUE, 
        split.by = "Phase",
        label.size = 4, 
        repel = TRUE) + 
  theme(axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 6)) +
  NoLegend()

We don’t see any noticeable cell cycle phase effects in our clusters.

Step 5: Cell Typing

Step 5.1: Exploration of cluster identities

Step 5.1.1: Explore PC metagenes

We can explore if any specific PCs drive the separation of the clusters.

# Define columns in the metadata to visualize.
columns = c(paste0("PC_", 1:12), "ident", "umap_1", "umap_2")

# Extract data from Seurat object based on column names.
pc_data <- FetchData(seurat_cluster, vars = columns)

# Add cluster label to the centre of cluster on UMAP. 
umap_label = FetchData(seurat_cluster, 
                       vars = c("ident", "umap_1", "umap_2")) %>%
  group_by(ident) %>%
  summarise(x = mean(umap_1), y = mean(umap_2))

# Plot a UMAP plot for each of the PCs.
map(paste0("PC_", 1:12), function(pc){
  ggplot(pc_data, aes(umap_1, umap_2)) +
    geom_point(aes_string(color = pc), alpha = 0.7) +
    scale_color_gradient(guide = "none", low = "#d6cfc7", high = "#01013f") +
    geom_text(data = umap_label, aes(label = ident, x, y), color = "#960000", size = 4) +
    theme_classic() +
    ggtitle(pc)}) %>%
  plot_grid(plotlist = .)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Remove unneeded objects.
rm(columns, pc_data, umap_label)

We can print the genes driving different PCs and correlate those metagenes with the identity of the clusters.

# Print PC metagenes.
print(seurat_cluster[["pca"]], dims = 1:12, nfeatures = 5)
## PC_ 1 
## Positive:  Mgp, Col3a1, Gsn, Col1a2, Col1a1 
## Negative:  Sftpc, Lyz2, Sftpa1, Cxcl15, Sftpb 
## PC_ 2 
## Positive:  Mgp, Sftpc, Col3a1, Lyz2, Col1a2 
## Negative:  Sec14l3, Cyp2f2, AU040972, Tmem212, Aldh1a1 
## PC_ 3 
## Positive:  Ctla2a, Vwf, Cldn5, Lyve1, Ptprb 
## Negative:  Mgp, Aldh1a1, Cyp2f2, Sec14l3, AU040972 
## PC_ 4 
## Positive:  Ager, Hopx, Rtkn2, Igfbp2, S100a6 
## Negative:  Gsn, Dcn, Vwf, Lyz2, Sftpc 
## PC_ 5 
## Positive:  Gsn, Inmt, Ager, Hopx, Mfap4 
## Negative:  Spp1, Timp1, Ccl21a, Acta2, Fn1 
## PC_ 6 
## Positive:  Spp1, Vwf, Timp1, Acta2, Tagln 
## Negative:  Ccl21a, Mmrn1, Prox1, Flt4, Reln 
## PC_ 7 
## Positive:  Dcn, Col1a2, Igfbp4, Col1a1, Pi16 
## Negative:  Inmt, Mfap4, Scgb1a1, Gpx3, Npnt 
## PC_ 8 
## Positive:  Inmt, Ager, Sftpc, Hopx, Sec14l3 
## Negative:  Scgb1a1, Scgb3a2, Scgb3a1, Bpifa1, Cyp2f2 
## PC_ 9 
## Positive:  Car4, Cldn5, Tmem100, Kdr, Dcn 
## Negative:  Vwf, Selp, Fabp4, Vcam1, Aqp1 
## PC_ 10 
## Positive:  Acta2, Tagln, Gsn, Tpm2, Dcn 
## Negative:  Serpina3n, Eln, Mgp, Saa3, Fn1 
## PC_ 11 
## Positive:  Lyz1, Cbr2, Malat1, Thbs1, Cdkn1c 
## Negative:  S100a6, Cd177, Wfdc2, AU040972, Cxcl17 
## PC_ 12 
## Positive:  Lyz2, Eln, Col1a1, Scgb1a1, Col1a2 
## Negative:  Lyz1, S100a6, Cd177, Clu, Wfdc2

Step 5.2: Explore cell identities with canonical markers

Step 5.2.1: Explore AT1 clusters

The SCTransformed data was performed only on the 3000 most variable genes. Ideally, we want to explore the identity of the clusters with all the genes in the dataset and not just the top 3000 most variable genes, so we need to set the default assay back to the RNA assay.

# Set default assay back to RNA.
DefaultAssay(seurat_cluster) <- "RNA"

We can explore canonical cell type markers of our cell types of interest and visualize their expression on our UMAP. Canonical markers are obtained from LungMAP and A molecular cell atlas of the human lung from single-cell RNA sequencing (2020) Nature

AT1 cells are defined by the cell markers: Hopx, Ager, Rtkn2, Pdpn and Clic5. We can visualize both the UMAP and Dotplot to see which cluster(s) do these markers localize.

# Visualize AT1 markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Hopx", "Ager", "Rtkn2", "Pdpn", "Clic5"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("AT1 markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.2: Explore AT2 clusters

AT2 cells are defined by the cell markers: Sftpb, Sftpc, Sftpd, Muc1, Etv5 and Lamp3.

# Visualize AT2 markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Sftpb", "Sftpc", "Sftpd", "Muc1", "Etv5", "Lamp3"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("AT2 markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.3: Explore fibroblast clusters

Alveolar fibroblast cells are defined by the cell markers: Pdgfra, Tcf21, Wnt2, Mfap5, Col1a1.

Pericytes are defined by the cell markers: Pdgfrb, Trpc6, Cspg4.

# Visualize alveolar fibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Tcf21", "Wnt2", "Mfap5", "Pdgfra", "Pdgfrb", "Trpc6", "Col1a1"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("Fibroblast markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.4: Explore myofibroblast clusters

Myofibroblast cells are defined by the cell markers: Wt1, Fgf18, Dach2, Frem2, Eln, Acta2.

# Visualize myofibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Wt1", "Fgf18", "Dach2", "Frem2", "Acta2"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("Myofibroblast markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.3: Cell type annotation

Step 5.3.1: Reference-based Azimuth annotation

Azimuth uses a reference-based mapping with the Human Biomolecular Atlas Project, containing references for multiple human tissues.

# Check all available Azimuth references.
available_data <- AvailableData()
available_data[grep("Azimuth", available_data[, 3]), 1:3] # Use lungref.
##                                  Dataset Version                        Summary
## adiposeref.SeuratData         adiposeref   1.0.0     Azimuth Reference: adipose
## bonemarrowref.SeuratData   bonemarrowref   1.0.0  Azimuth Reference: bonemarrow
## fetusref.SeuratData             fetusref   1.0.0       Azimuth Reference: fetus
## heartref.SeuratData             heartref   1.0.0       Azimuth Reference: heart
## humancortexref.SeuratData humancortexref   1.0.0 Azimuth Reference: humancortex
## kidneyref.SeuratData           kidneyref   1.0.2      Azimuth Reference: kidney
## lungref.SeuratData               lungref   2.0.0        Azimuth Reference: lung
## mousecortexref.SeuratData mousecortexref   1.0.0 Azimuth Reference: mousecortex
## pancreasref.SeuratData       pancreasref   1.0.0    Azimuth Reference: pancreas
## pbmcref.SeuratData               pbmcref   1.0.0        Azimuth Reference: pbmc
## tonsilref.SeuratData           tonsilref   2.0.0      Azimuth Reference: tonsil
# Run Azimuth with lungref.
options(timeout = 1000)
seurat_annotated <- RunAzimuth(seurat_cluster, reference = "lungref", assay = "RNA")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from MOUSE with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Found 16100 out of 22546 total inputs in conversion table
## Normalizing query using reference SCT model
## Warning: No layers found matching search pattern provided
## Warning: 889 features of the features specified were not present in both the reference query assays. 
## Continuing with remaining 2111 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
##  Found 17335 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## 
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 11:50:42 Read 80505 rows
## 11:50:42 Processing block 1 of 1
## 11:50:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 11:50:42 Initializing by weighted average of neighbor coordinates using 1 thread
## 11:50:42 Commencing optimization for 67 epochs, with 1610100 positive edges
## 11:50:49 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
##     Finding neighbors of original query cells
##     Finding neighbors of transformed query cells
##     Computing query SNN
##     Determining bandwidth and computing transition probabilities
## Total elapsed time: 54.9975788593292
# Remove unneeded object.
rm(available_data)
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_annotated) <- "predicted.ann_finest_level"

# Visualize annotated cluster with a UMAP.
DimPlot(seurat_annotated,
        reduction = "umap",
        group.by = "predicted.ann_finest_level",
        label = TRUE,
        label.size = 4,
        repel = TRUE) + 
  ggtitle("Azimuth annotation") +
  theme(plot.title = element_text(hjust = 0.5)) +
  NoLegend()
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Visualize annotation prediction score.
FeaturePlot(seurat_annotated, 
            reduction = "umap", 
            features = "mapping.score", 
            cols = c("#01013f", "#59974d", "#fff000")) +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)) +
  ggtitle("Annotation prediction score")

# Get metadata.
annotation_meta = seurat_annotated@meta.data

# Subset the required columns.
annotation_meta <- annotation_meta %>% subset(select = c(SCT_snn_res.1, predicted.ann_finest_level, mapping.score))

# Group the table by cluster and annotation.
annotation_meta <- annotation_meta %>% 
  dplyr::rename(cluster = "SCT_snn_res.1", 
                cluster_annotation = "predicted.ann_finest_level") %>%
  group_by(cluster, cluster_annotation) %>% 
  summarise(mean_mapping_score = mean(mapping.score))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
# Visualize the probable annotations for each cluster.
annotation_meta <- annotation_meta %>% arrange(cluster, desc(mean_mapping_score)) %>% slice_head(n = 3)
annotation_meta
## # A tibble: 107 × 3
## # Groups:   cluster [39]
##    cluster cluster_annotation      mean_mapping_score
##    <fct>   <chr>                                <dbl>
##  1 0       AT2                                  0.642
##  2 1       Smooth muscle                        1    
##  3 1       AT2 proliferating                    0.724
##  4 1       AT2                                  0.678
##  5 2       Club (non-nasal)                     0.990
##  6 2       Lymphatic EC mature                  0.938
##  7 2       Adventitial fibroblasts              0.854
##  8 3       Suprabasal                           0.992
##  9 3       Pericytes                            0.955
## 10 3       Smooth muscle                        0.911
## # ℹ 97 more rows
# Remove unneeded object.
rm(annotation_meta)

Step 5.3.2: Marker-based ScType annotation

We can cross-check the reference-based annotation with the ScType annotation for validation.

# Load in ScType functions.
source("~/Documents/Work/UnityHealth/Data_Projects/auto_detect_tissue_type_KK.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

# Set path to ScType reference database.
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"

# Check tissue type for validation.
tissue_guess <- auto_detect_tissue_type(path_to_db_file = db_, 
                                        seuratObject = seurat_cluster, 
                                        scaled = TRUE, 
                                        assay = "SCT") 
## [1] "Checking...Immune system"
## Selecting by scores
## [1] "Checking...Pancreas"
## Selecting by scores
## [1] "Checking...Liver"
## Selecting by scores
## [1] "Checking...Eye"
## Selecting by scores
## [1] "Checking...Kidney"
## Selecting by scores
## [1] "Checking...Brain"
## Selecting by scores
## [1] "Checking...Lung"
## Selecting by scores
## [1] "Checking...Adrenal"
## Selecting by scores
## [1] "Checking...Heart"
## Selecting by scores
## [1] "Checking...Intestine"
## Selecting by scores
## [1] "Checking...Muscle"
## Selecting by scores
## [1] "Checking...Placenta"
## Selecting by scores
## [1] "Checking...Spleen"
## Selecting by scores
## [1] "Checking...Stomach"
## Selecting by scores
## [1] "Checking...Thymus"
## Selecting by scores
## [1] "Checking...Hippocampus"
## Selecting by scores

# Define Seurat object.
seuobj = seurat_cluster

# Specify tissue of interest.
tissue = "Lung"

# Prepare gene sets. 
gs_list <- gene_sets_prepare(db_, tissue)
    
# Check Seurat object version - count matrix is extracted differently between versions.
seurat_package_v5 <- isFALSE("counts" %in% names(attributes(seuobj[["RNA"]])))
print(sprintf("Seurat object %s is used", ifelse(seurat_package_v5, "v5", "v4")))
## [1] "Seurat object v5 is used"
# Extract scaled scRNAseq matrix.
scRNAseqData_scaled <- if (seurat_package_v5) as.matrix(seuobj[["RNA"]]$scale.data) else 
  as.matrix(seuobj[["RNA"]]@scale.data)
    
# Run ScType.
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled, 
                       scaled = TRUE, 
                       gs = gs_list$gs_positive, 
                       gs2 = gs_list$gs_negative)
    
# Merge ScType results by clusters.
cL_results <- do.call("rbind", 
                      lapply(unique(seuobj@meta.data$seurat_clusters), 
                             function(cl){es.max.cl = sort(
                               rowSums(
                                 es.max[, rownames(
                                   seuobj@meta.data[
                                     seuobj@meta.data$seurat_clusters == cl, ])]),
                               decreasing = !0)
                             head(data.frame(cluster = cl,
                                             type = names(es.max.cl),
                                             scores = es.max.cl,
                                             ncells = sum(seuobj@meta.data$seurat_clusters == cl)), 10)
                             }))
sctype_scores <- cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
    
# Set low-confident (low ScType score) clusters to "Unknown".
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[, 1:3])
## # A tibble: 39 × 3
## # Groups:   cluster [39]
##    cluster type                             scores
##    <fct>   <chr>                             <dbl>
##  1 8       Pulmonary alveolar type II cells  9640.
##  2 1       Pulmonary alveolar type II cells 14485.
##  3 4       Fibroblasts                      20590.
##  4 15      Pulmonary alveolar type II cells  6309.
##  5 3       Endothelial cell                 20232.
##  6 0       Pulmonary alveolar type II cells 18099.
##  7 25      Pulmonary alveolar type II cells  3083.
##  8 20      Fibroblasts                       2710.
##  9 19      Pulmonary alveolar type I cells  13307.
## 10 28      Pulmonary alveolar type II cells  1448.
## # ℹ 29 more rows
# Add annotated cell types to the metadata.
seurat_cluster@meta.data$sctype_annotation = ""

for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster==j, ]
  seurat_cluster@meta.data$sctype_annotation[seurat_cluster@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
# Plot UMAP with ScType annotation.
DimPlot(seurat_cluster, 
        reduction = "umap", 
        label = TRUE, 
        label.size = 3,
        repel = TRUE, 
        group.by = "sctype_annotation") + 
  NoLegend()

# Remove unneeded objects.
rm(auto_detect_tissue_type, cL_results, cl_type, db_, es.max,
   gene_sets_prepare, gs_list, j, scRNAseqData_scaled, sctype_score,
   sctype_scores, seuobj, seurat_package_v5, tissue, tissue_guess)

Step 5.4: Explore cell subtypes of interest

Step 5.4.1: Add the Azimuth annotation to original seurat object

# Add metadata from seurat_annotated, which contains the Azimuth annotation and ScType annotation to seurat_cluster.
seurat_cluster@meta.data <- seurat_annotated@meta.data

# Get Ensembl annotation data.
# Connect to AnnotationHub.
ah <- AnnotationHub()

# Access the Ensembl database for organism.
ahDb <- query(ah, 
              pattern = c("Mus musculus", "EnsDb"), 
              ignore.case = TRUE)

# Acquire the latest annotation files.
id <- ahDb %>%
  mcols() %>%
  rownames() %>%
  tail(n = 1)

# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, return.type = "data.frame")

# Select annotations of interest.
annotations <- annotations %>%
  dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)

# Remove unneeded objects.
rm(ah, ahDb, edb, id)

Step 5.4.2: PDGFR-a fibroblast (Good)

Based on the Azimuth annotation, let’s visualize which fibroblast expresses PDGFR-a.

# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
pdgfra_umap <- FeaturePlot(seurat_cluster, 
                           reduction = "umap", 
                           features = "Pdgfra", 
                           label = FALSE,
                           order = TRUE,
                           cols = c("#01013f", "#59974d", "#fff000")) +
  ggtitle("PDGFR-a expression") +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)) 

LabelClusters(pdgfra_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Confirm the cluster(s) that express the marker.
DotPlot(seurat_cluster, features = "Pdgfra", 
        assay = "RNA", cols = c("#d6cfc7", "#01013f"))

It looks like PDGFR-a is mostly expressed by alveolar fibroblasts and peribronchial fibroblasts. Let’s see if we can find out the markers that define this fibroblast.

# Find the cell counts for pericytes in each treatment.
n_cells <- FetchData(seurat_cluster, 
                     vars = c("predicted.ann_finest_level", "treatment")) %>% 
  dplyr::count(predicted.ann_finest_level, treatment) %>%
  subset(., predicted.ann_finest_level %in% c("Alveolar fibroblasts", "Peribronchial fibroblasts"))

# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 8)) 

# Find conserved markers in the alveolar fibroblast cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
for (cluster in c("Alveolar fibroblasts", "Peribronchial fibroblasts")) {
  PDGFRA_markers <- get_conserved_markers(seuobj = seurat_cluster,
                                          annotations = annotations,
                                          cluster = cluster,
                                          expt_condition = "treatment")
  assign(cluster, PDGFRA_markers)
}
## Testing group Sham: (Alveolar fibroblasts) vs (AT2, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Alveolar fibroblasts) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
## Testing group Sham: (Peribronchial fibroblasts) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Peribronchial fibroblasts) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
PDGFRA_markers <- rbind(`Alveolar fibroblasts`, `Peribronchial fibroblasts`) %>%
  mutate(avg_fc = (Bleomycin_avg_log2FC + Sham_avg_log2FC)/2) %>%
  group_by(cluster_id) %>%
  top_n(n = 20,
        wt = avg_fc)
kable(PDGFRA_markers)
cluster_id gene Sham_p_val Sham_avg_log2FC Sham_pct.1 Sham_pct.2 Sham_p_val_adj Bleomycin_p_val Bleomycin_avg_log2FC Bleomycin_pct.1 Bleomycin_pct.2 Bleomycin_p_val_adj max_pval minimump_p_val description avg_fc
Alveolar fibroblasts Itga8 0 5.710093 0.782 0.024 0 0 4.121631 0.619 0.077 0 0 0 integrin alpha 8 [Source:MGI Symbol;Acc:MGI:109442] 4.915862
Alveolar fibroblasts Col13a1 0 5.904985 0.609 0.014 0 0 3.928055 0.569 0.060 0 0 0 collagen, type XIII, alpha 1 [Source:MGI Symbol;Acc:MGI:1277201] 4.916520
Alveolar fibroblasts Slc38a5 0 5.791733 0.506 0.012 0 0 4.031316 0.369 0.033 0 0 0 solute carrier family 38, member 5 [Source:MGI Symbol;Acc:MGI:2148066] 4.911524
Alveolar fibroblasts Scube2 0 6.161223 0.491 0.015 0 0 4.361700 0.353 0.035 0 0 0 signal peptide, CUB domain, EGF-like 2 [Source:MGI Symbol;Acc:MGI:1928765] 5.261462
Alveolar fibroblasts Fgfr4 0 6.950079 0.312 0.004 0 0 4.793985 0.188 0.011 0 0 0 fibroblast growth factor receptor 4 [Source:MGI Symbol;Acc:MGI:95525] 5.872032
Alveolar fibroblasts Slc7a10 0 7.778898 0.249 0.002 0 0 5.858652 0.118 0.003 0 0 0 solute carrier family 7 (cationic amino acid transporter, y+ system), member 10 [Source:MGI Symbol;Acc:MGI:1858261] 6.818775
Alveolar fibroblasts Amph 0 5.880662 0.168 0.004 0 0 4.159611 0.119 0.012 0 0 0 amphiphysin [Source:MGI Symbol;Acc:MGI:103574] 5.020136
Alveolar fibroblasts Adra1a 0 6.820929 0.138 0.002 0 0 4.349089 0.112 0.007 0 0 0 adrenergic receptor, alpha 1a [Source:MGI Symbol;Acc:MGI:104773] 5.585009
Alveolar fibroblasts Slc22a3 0 5.987598 0.123 0.003 0 0 4.127269 0.072 0.005 0 0 0 solute carrier family 22 (organic cation transporter), member 3 [Source:MGI Symbol;Acc:MGI:1333817] 5.057434
Alveolar fibroblasts Adrb3 0 5.844961 0.117 0.004 0 0 3.920863 0.076 0.009 0 0 0 adrenergic receptor, beta 3 [Source:MGI Symbol;Acc:MGI:87939] 4.882912
Alveolar fibroblasts Colq 0 7.685366 0.092 0.001 0 0 5.236795 0.078 0.003 0 0 0 collagen like tail subunit of asymmetric acetylcholinesterase [Source:MGI Symbol;Acc:MGI:1338761] 6.461080
Alveolar fibroblasts Cass4 0 5.857382 0.058 0.001 0 0 3.877901 0.044 0.006 0 0 0 Cas scaffolding protein family member 4 [Source:MGI Symbol;Acc:MGI:2444482] 4.867642
Alveolar fibroblasts Mettl21e 0 6.246712 0.043 0.001 0 0 4.249276 0.027 0.002 0 0 0 methyltransferase like 21E [Source:MGI Symbol;Acc:MGI:2685837] 5.247994
Alveolar fibroblasts Slc13a4 0 6.813269 0.035 0.000 0 0 3.907303 0.016 0.002 0 0 0 solute carrier family 13 (sodium/sulfate symporters), member 4 [Source:MGI Symbol;Acc:MGI:2442367] 5.360286
Alveolar fibroblasts Gm48662 0 6.335692 0.037 0.001 0 0 3.498129 0.028 0.003 0 0 0 predicted gene, 48662 [Source:MGI Symbol;Acc:MGI:6098277] 4.916910
Alveolar fibroblasts Ano5 0 6.605714 0.033 0.000 0 0 4.295960 0.022 0.001 0 0 0 anoctamin 5 [Source:MGI Symbol;Acc:MGI:3576659] 5.450837
Alveolar fibroblasts Ms4a4c 0 7.359657 0.023 0.000 0 0 3.621467 0.012 0.001 0 0 0 membrane-spanning 4-domains, subfamily A, member 4C [Source:MGI Symbol;Acc:MGI:1927656] 5.490562
Alveolar fibroblasts Kcng1 0 5.457614 0.012 0.000 0 0 4.535511 0.015 0.001 0 0 0 potassium voltage-gated channel, subfamily G, member 1 [Source:MGI Symbol;Acc:MGI:3616086] 4.996563
Alveolar fibroblasts Cntn1 0 6.079370 0.014 0.000 0 0 3.802363 0.015 0.002 0 0 0 contactin 1 [Source:MGI Symbol;Acc:MGI:105980] 4.940867
Alveolar fibroblasts Gm13703 0 6.468622 0.012 0.000 0 0 3.758446 0.016 0.002 0 0 0 predicted gene 13703 [Source:MGI Symbol;Acc:MGI:3651307] 5.113534
Peribronchial fibroblasts Eln 0 4.122702 0.861 0.139 0 0 2.825576 0.915 0.354 0 0 0 elastin [Source:MGI Symbol;Acc:MGI:95317] 3.474140
Peribronchial fibroblasts Col6a3 0 3.986600 0.788 0.077 0 0 2.967634 0.866 0.199 0 0 0 collagen, type VI, alpha 3 [Source:MGI Symbol;Acc:MGI:88461] 3.477117
Peribronchial fibroblasts Cxcl14 0 4.257647 0.628 0.057 0 0 2.384786 0.776 0.179 0 0 0 C-X-C motif chemokine ligand 14 [Source:MGI Symbol;Acc:MGI:1888514] 3.321217
Peribronchial fibroblasts Col5a1 0 3.677700 0.623 0.062 0 0 2.907386 0.812 0.174 0 0 0 collagen, type V, alpha 1 [Source:MGI Symbol;Acc:MGI:88457] 3.292543
Peribronchial fibroblasts Ltbp2 0 4.583246 0.454 0.031 0 0 2.912186 0.650 0.107 0 0 0 latent transforming growth factor beta binding protein 2 [Source:MGI Symbol;Acc:MGI:99502] 3.747716
Peribronchial fibroblasts Lamc3 0 4.674723 0.284 0.016 0 0 2.043056 0.149 0.033 0 0 0 laminin gamma 3 [Source:MGI Symbol;Acc:MGI:1344394] 3.358890
Peribronchial fibroblasts D430041D05Rik 0 4.960910 0.169 0.007 0 0 2.454889 0.080 0.013 0 0 0 RIKEN cDNA D430041D05 gene [Source:MGI Symbol;Acc:MGI:2181743] 3.707900
Peribronchial fibroblasts Ccn4 0 4.260328 0.177 0.012 0 0 2.546925 0.403 0.063 0 0 0 cellular communication network factor 4 [Source:MGI Symbol;Acc:MGI:1197008] 3.403627
Peribronchial fibroblasts Col5a3 0 3.895073 0.164 0.011 0 0 3.512139 0.363 0.048 0 0 0 collagen, type V, alpha 3 [Source:MGI Symbol;Acc:MGI:1858212] 3.703606
Peribronchial fibroblasts Col28a1 0 5.387062 0.069 0.002 0 0 3.256865 0.285 0.032 0 0 0 collagen, type XXVIII, alpha 1 [Source:MGI Symbol;Acc:MGI:2685312] 4.321963
Peribronchial fibroblasts Adamts12 0 4.252873 0.108 0.007 0 0 2.570841 0.255 0.032 0 0 0 ADAM metallopeptidase with thrombospondin type 1 motif 12 [Source:MGI Symbol;Acc:MGI:2146046] 3.411857
Peribronchial fibroblasts Gpr176 0 5.281970 0.026 0.001 0 0 2.582730 0.121 0.016 0 0 0 G protein-coupled receptor 176 [Source:MGI Symbol;Acc:MGI:2685858] 3.932350
Peribronchial fibroblasts Megf10 0 4.226124 0.037 0.002 0 0 3.283944 0.093 0.008 0 0 0 multiple EGF-like-domains 10 [Source:MGI Symbol;Acc:MGI:2685177] 3.755034
Peribronchial fibroblasts C1qtnf3 0 4.753842 0.162 0.007 0 0 2.134169 0.044 0.009 0 0 0 C1q and tumor necrosis factor related protein 3 [Source:MGI Symbol;Acc:MGI:1932136] 3.444005
Peribronchial fibroblasts A2m 0 4.166858 0.017 0.001 0 0 2.732664 0.075 0.010 0 0 0 alpha-2-macroglobulin [Source:MGI Symbol;Acc:MGI:2449119] 3.449761
Peribronchial fibroblasts Col24a1 0 4.986037 0.015 0.001 0 0 3.029205 0.049 0.004 0 0 0 collagen, type XXIV, alpha 1 [Source:MGI Symbol;Acc:MGI:1918605] 4.007621
Peribronchial fibroblasts Tmem200a 0 4.619795 0.052 0.002 0 0 2.306634 0.067 0.009 0 0 0 transmembrane protein 200A [Source:MGI Symbol;Acc:MGI:1924470] 3.463214
Peribronchial fibroblasts Hoxb8 0 4.778536 0.017 0.001 0 0 2.253017 0.063 0.010 0 0 0 homeobox B8 [Source:MGI Symbol;Acc:MGI:96189] 3.515777
Peribronchial fibroblasts Mrgprf 0 4.672262 0.058 0.003 0 0 1.921079 0.035 0.005 0 0 0 MAS-related GPR, member F [Source:MGI Symbol;Acc:MGI:2384823] 3.296670
Peribronchial fibroblasts Gm5084 0 5.185958 0.020 0.001 0 0 2.519955 0.015 0.003 0 0 0 predicted gene 5084 [Source:MGI Symbol;Acc:MGI:3647835] 3.852957
# Remove unneeded objects.
rm(pdgfra_umap, n_cells, `Alveolar fibroblasts`, `Peribronchial fibroblasts`, cluster, PDGFRA_markers)

Step 5.4.3: PDGFR-b fibroblast (Bad)

Based on the Azimuth annotation, let’s visualize which fibroblast expresses PDGFR-b.

# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
pdgfrb_umap <- FeaturePlot(seurat_cluster, 
                           reduction = "umap", 
                           features = "Pdgfrb", 
                           label = FALSE,
                           order = TRUE,
                           cols = c("#01013f", "#59974d", "#fff000")) +
  ggtitle("PDGFR-b expression") +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)) 

LabelClusters(pdgfrb_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Confirm the cluster(s) that express the marker.
DotPlot(seurat_cluster, features = "Pdgfrb", 
        assay = "RNA", cols = c("#d6cfc7", "#01013f"))

It looks like PDGFR-b is mostly expressed by pericytes. Let’s see if we can find out the markers that define this fibroblast.

# Find the cell counts for pericytes in each treatment.
n_cells <- FetchData(seurat_cluster, 
                     vars = c("predicted.ann_finest_level", "treatment")) %>% 
  dplyr::count(predicted.ann_finest_level, treatment) %>%
  subset(., predicted.ann_finest_level == "Pericytes")

# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 8)) 

# Find conserved markers in the pericytes cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
pericytes_markers <- get_conserved_markers(seuobj = seurat_cluster,
                                           annotations = annotations,
                                           cluster = "Pericytes",
                                           expt_condition = "treatment")
## Testing group Sham: (Pericytes) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Pericytes) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
pericytes_markers <- pericytes_markers %>%
  mutate(avg_fc = (Bleomycin_avg_log2FC + Sham_avg_log2FC)/2) %>%
  group_by(cluster_id) %>%
  top_n(n = 20,
        wt = avg_fc)
kable(pericytes_markers)
cluster_id gene Sham_p_val Sham_avg_log2FC Sham_pct.1 Sham_pct.2 Sham_p_val_adj Bleomycin_p_val Bleomycin_avg_log2FC Bleomycin_pct.1 Bleomycin_pct.2 Bleomycin_p_val_adj max_pval minimump_p_val description avg_fc
Pericytes Trpc6 0 10.144589 0.312 0.000 0 0 8.986716 0.473 0.006 0 0 0 transient receptor potential cation channel, subfamily C, member 6 [Source:MGI Symbol;Acc:MGI:109523] 9.565653
Pericytes Higd1b 0 10.102512 0.312 0.001 0 0 11.049704 0.582 0.001 0 0 0 HIG1 domain family, member 1B [Source:MGI Symbol;Acc:MGI:1922939] 10.576108
Pericytes Vtn 0 12.722170 0.250 0.000 0 0 9.826647 0.273 0.001 0 0 0 vitronectin [Source:MGI Symbol;Acc:MGI:98940] 11.274409
Pericytes Fam162b 0 9.215323 0.250 0.001 0 0 10.871935 0.273 0.000 0 0 0 family with sequence similarity 162, member B [Source:MGI Symbol;Acc:MGI:1924546] 10.043629
Pericytes Padi3 0 14.691993 0.125 0.000 0 0 8.370606 0.036 0.000 0 0 0 peptidyl arginine deiminase, type III [Source:MGI Symbol;Acc:MGI:1338891] 11.531300
Pericytes Cstdc2 0 11.446853 0.125 0.000 0 0 11.276197 0.182 0.000 0 0 0 cystatin domain containing 2 [Source:MGI Symbol;Acc:MGI:1924955] 11.361525
Pericytes Adcy8 0 10.546986 0.062 0.000 0 0 11.174731 0.218 0.000 0 0 0 adenylate cyclase 8 [Source:MGI Symbol;Acc:MGI:1341110] 10.860859
Pericytes Vsnl1 0 9.084571 0.500 0.003 0 0 6.903979 0.182 0.005 0 0 0 visinin-like 1 [Source:MGI Symbol;Acc:MGI:1349453] 7.994275
Pericytes Lef1 0 10.488495 0.250 0.001 0 0 6.596202 0.200 0.009 0 0 0 lymphoid enhancer binding factor 1 [Source:MGI Symbol;Acc:MGI:96770] 8.542348
Pericytes Cox4i2 0 8.255949 0.688 0.036 0 0 9.229187 0.691 0.021 0 0 0 cytochrome c oxidase subunit 4I2 [Source:MGI Symbol;Acc:MGI:2135755] 8.742568
Pericytes Ndufa4l2 0 8.250385 0.688 0.010 0 0 7.470427 0.636 0.022 0 0 0 Ndufa4, mitochondrial complex associated like 2 [Source:MGI Symbol;Acc:MGI:3039567] 7.860406
Pericytes Olfr553 0 12.274668 0.062 0.000 0 0 9.520471 0.018 0.000 0 0 0 NA 10.897570
Pericytes Fabp7 0 10.874978 0.125 0.000 0 0 6.095756 0.036 0.001 0 0 0 fatty acid binding protein 7, brain [Source:MGI Symbol;Acc:MGI:101916] 8.485367
Pericytes Kcnj12 0 10.063393 0.188 0.001 0 0 7.428102 0.073 0.002 0 0 0 potassium inwardly-rectifying channel, subfamily J, member 12 [Source:MGI Symbol;Acc:MGI:108495] 8.745747
Pericytes Card9 0 8.304698 0.062 0.001 0 0 7.716783 0.164 0.002 0 0 0 caspase recruitment domain family, member 9 [Source:MGI Symbol;Acc:MGI:2685628] 8.010741
Pericytes Gucy1b1 0 8.547001 0.688 0.016 0 0 7.404685 0.727 0.050 0 0 0 guanylate cyclase 1, soluble, beta 1 [Source:MGI Symbol;Acc:MGI:1860604] 7.975843
Pericytes Agtr1a 0 8.316184 0.188 0.004 0 0 8.213101 0.218 0.006 0 0 0 angiotensin II receptor, type 1a [Source:MGI Symbol;Acc:MGI:87964] 8.264643
Pericytes Trarg1 0 8.244735 0.125 0.001 0 0 7.507260 0.091 0.002 0 0 0 trafficking regulator of GLUT4 (SLC2A4) 1 [Source:MGI Symbol;Acc:MGI:3029307] 7.875998
Pericytes Kcnq5 0 8.681899 0.125 0.001 0 0 8.037112 0.145 0.003 0 0 0 potassium voltage-gated channel, subfamily Q, member 5 [Source:MGI Symbol;Acc:MGI:1924937] 8.359505
Pericytes Tmem200a 0 9.122794 0.250 0.002 0 0 6.651709 0.255 0.014 0 0 0 transmembrane protein 200A [Source:MGI Symbol;Acc:MGI:1924470] 7.887252
# Remove unneeded objects.
rm(pdgfrb_umap, n_cells, pericytes_markers)

Step 5.4.4: Krt5+ epithelial cells

Based on the Azimuth annotation, let’s visualize which epithelial cells express Krt5.

# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
Krt5_umap <- FeaturePlot(seurat_cluster, 
                         reduction = "umap", 
                         features = "Krt5", 
                         label = FALSE,
                         order = TRUE,
                         cols = c("#01013f", "#59974d", "#fff000")) +
  ggtitle("Krt5+ expression") +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)) 

LabelClusters(Krt5_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Find out which clusters express the marker.
DotPlot(seurat_cluster, features = "Krt5", 
        assay = "RNA", cols = c("#d6cfc7", "#01013f"))

It looks like Krt5+ is mostly expressed by basal resting and club (nasal) cells. Let’s see if we can find out the markers that define these epithelial cells.

# Find the cell counts for basal and club cells in each treatment.
n_cells <- FetchData(seurat_cluster, 
                     vars = c("predicted.ann_finest_level", "treatment")) %>% 
  dplyr::count(predicted.ann_finest_level, treatment) %>%
  subset(., predicted.ann_finest_level %in% c("Club (nasal)"))

# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 8)) 

# Find conserved markers in the club (nasal) cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
club_markers <- get_conserved_markers(seuobj = seurat_cluster,
                                      annotations = annotations,
                                      cluster = "Club (nasal)",
                                      expt_condition = "treatment")
## Testing group Sham: (Club (nasal)) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Club (nasal)) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
club_markers <- head(club_markers, 20)
kable(club_markers)
cluster_id gene Sham_p_val Sham_avg_log2FC Sham_pct.1 Sham_pct.2 Sham_p_val_adj Bleomycin_p_val Bleomycin_avg_log2FC Bleomycin_pct.1 Bleomycin_pct.2 Bleomycin_p_val_adj max_pval minimump_p_val description
Club (nasal) Ifi202b 0 9.536563 0.8 0.001 0 0 7.426929 0.854 0.005 0 0 0 interferon activated gene 202B [Source:MGI Symbol;Acc:MGI:1347083]
Club (nasal) 4833423E24Rik 0 9.635913 0.8 0.001 0 0 7.632552 0.812 0.004 0 0 0 NA
Club (nasal) Tns4 0 9.326516 0.8 0.001 0 0 5.419853 0.229 0.004 0 0 0 tensin 4 [Source:MGI Symbol;Acc:MGI:2144377]
Club (nasal) Dapl1 0 8.420025 0.8 0.002 0 0 7.203046 0.875 0.006 0 0 0 death associated protein-like 1 [Source:MGI Symbol;Acc:MGI:1923997]
Club (nasal) Krt17 0 9.644663 0.8 0.002 0 0 7.569699 0.792 0.006 0 0 0 keratin 17 [Source:MGI Symbol;Acc:MGI:96691]
Club (nasal) Krt5 0 9.360105 0.8 0.002 0 0 7.276484 0.792 0.005 0 0 0 keratin 5 [Source:MGI Symbol;Acc:MGI:96702]
Club (nasal) Gpr87 0 10.925382 0.6 0.000 0 0 7.772418 0.333 0.001 0 0 0 G protein-coupled receptor 87 [Source:MGI Symbol;Acc:MGI:1934133]
Club (nasal) Defb1 0 7.905752 0.6 0.001 0 0 6.463990 0.583 0.005 0 0 0 defensin beta 1 [Source:MGI Symbol;Acc:MGI:1096878]
Club (nasal) Trim29 0 8.438270 0.6 0.001 0 0 6.999176 0.542 0.004 0 0 0 tripartite motif-containing 29 [Source:MGI Symbol;Acc:MGI:1919419]
Club (nasal) Slc25a21 0 9.397660 0.6 0.001 0 0 6.653883 0.146 0.001 0 0 0 solute carrier family 25 (mitochondrial oxodicarboxylate carrier), member 21 [Source:MGI Symbol;Acc:MGI:2445059]
Club (nasal) Mdga2 0 9.436134 0.6 0.001 0 0 5.148588 0.229 0.004 0 0 0 MAM domain containing glycosylphosphatidylinositol anchor 2 [Source:MGI Symbol;Acc:MGI:2444706]
Club (nasal) Pkp1 0 10.516065 0.4 0.000 0 0 8.036203 0.438 0.002 0 0 0 plakophilin 1 [Source:MGI Symbol;Acc:MGI:1328359]
Club (nasal) Foxe1 0 8.538125 0.4 0.000 0 0 6.378833 0.229 0.002 0 0 0 forkhead box E1 [Source:MGI Symbol;Acc:MGI:1353500]
Club (nasal) Sall3 0 9.127323 0.4 0.000 0 0 7.131397 0.208 0.002 0 0 0 spalt like transcription factor 3 [Source:MGI Symbol;Acc:MGI:109295]
Club (nasal) Serpinb5 0 8.441000 0.4 0.001 0 0 6.108664 0.562 0.007 0 0 0 serine (or cysteine) peptidase inhibitor, clade B, member 5 [Source:MGI Symbol;Acc:MGI:109579]
Club (nasal) Dlk2 0 8.265797 0.6 0.001 0 0 6.617187 0.479 0.003 0 0 0 delta like non-canonical Notch ligand 2 [Source:MGI Symbol;Acc:MGI:2146838]
Club (nasal) Fmo6 0 8.337107 0.6 0.002 0 0 7.830383 0.604 0.003 0 0 0 flavin containing monooxygenase 6 [Source:MGI Symbol;Acc:MGI:2681841]
Club (nasal) Dsc3 0 10.278129 0.2 0.000 0 0 7.025309 0.208 0.001 0 0 0 desmocollin 3 [Source:MGI Symbol;Acc:MGI:1194993]
Club (nasal) Pax9 0 6.485695 0.8 0.005 0 0 6.282572 0.625 0.008 0 0 0 paired box 9 [Source:MGI Symbol;Acc:MGI:97493]
Club (nasal) Aqp3 0 8.515569 0.8 0.009 0 0 7.760175 0.917 0.009 0 0 0 aquaporin 3 [Source:MGI Symbol;Acc:MGI:1333777]
# Remove unneeded objects.
rm(Krt5_umap, n_cells, club_markers, get_conserved_markers, annotations)

Step 5.5: Cross-check and collapse annotated cell types

# Remove any cells that are labelled nasal. 
grep("nasal", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique() # Club (nasal), Goblet (nasal), Multiciliated (nasal).
## [1] "Multiciliated (non-nasal)" "Club (non-nasal)"         
## [3] "Club (nasal)"              "Goblet (nasal)"           
## [5] "Multiciliated (nasal)"
dim(seurat_cluster)[2] # 80,505
## [1] 80505
seurat_cluster <- subset(seurat_cluster, 
                         subset = predicted.ann_finest_level %in% c("Club (nasal)",
                                                                    "Goblet (nasal)",
                                                                    "Multiciliated (nasal)"),
                         invert = TRUE)
dim(seurat_cluster)[2] # 80,294 (211 cells removed)
## [1] 80294
# Remove any cells that are labelled mesothelial cells as they are pleural and are not specialized lung cells.  
grep("Mesothelium", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique() # Mesothelium.
## [1] "Mesothelium"
dim(seurat_cluster)[2] # 80,294
## [1] 80294
seurat_cluster <- subset(seurat_cluster, 
                         subset = predicted.ann_finest_level == "Mesothelium",
                         invert = TRUE)
dim(seurat_cluster)[2] # 80,290 (4 cells removed)
## [1] 80290
# Pull cell metadata from the Seurat object.
metadata <- seurat_cluster@meta.data

# Aggregate some annotated cell types.
# Find all fibroblasts and aggregate them together.
grep("fibroblast", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Alveolar fibroblasts"      "Adventitial fibroblasts"  
## [3] "Peribronchial fibroblasts" "Myofibroblasts"           
## [5] "Subpleural fibroblasts"
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Alveolar fibroblasts" = "Fibroblasts",
                            "Adventitial fibroblasts" = "Fibroblasts",
                            "Peribronchial fibroblasts" = "Fibroblasts",
                            "Subpleural fibroblasts" = "Fibroblasts"))

# Find all endothelial cells and aggregate them together.
grep("EC", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "EC general capillary"         "EC aerocyte capillary"       
## [3] "EC arterial"                  "Lymphatic EC mature"         
## [5] "EC venous systemic"           "EC venous pulmonary"         
## [7] "Lymphatic EC differentiating" "Lymphatic EC proliferating"
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "EC general capillary" = "Endothelial capillary",
                            "EC aerocyte capillary" = "Endothelial capillary",
                            "EC arterial" = "Endothelial arterial",
                            "EC venous systemic" = "Endothelial venous",
                            "EC venous pulmonary" = "Endothelial venous"))

metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Lymphatic EC mature" = "Endothelial lymphatic",
                            "Lymphatic EC differentiating" = "Endothelial lymphatic",
                            "Lymphatic EC proliferating" = "Endothelial lymphatic"))

# Find all smooth muscle cells and aggregate them together.
grep("SM|smooth muscle", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "SM activated stress response" "SMG duct"
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Smooth muscle" = "VSMCs",
                            "SM activated stress response" = "VSMCs"))

# Find all immune cells and aggregate them together.
metadata$predicted.ann_finest_level[metadata$predicted.ann_level_1 == "Immune"] %>% unique()
##  [1] "DC2"                     "Mast cells"             
##  [3] "CD8 T cells"             "Basal resting"          
##  [5] "CD4 T cells"             "AT1"                    
##  [7] "B cells"                 "Non-classical monocytes"
##  [9] "Endothelial capillary"   "Fibroblasts"            
## [11] "Plasma cells"            "Alveolar macrophages"
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "DC2" = "Immune",
                            "Mast cells" = "Immune",
                            "CD8 T cells" = "Immune",
                            "CD4 T cells" = "Immune",
                            "B cells" = "Immune",
                            "Non-classical monocytes" = "Immune",
                            "Plasma cells" = "Immune",
                            "Alveolar macrophages" = "Immune"))

# Get metadata column and rename.
metadata = (metadata %>%
              subset(select = -cell_type) %>%
              rename("title" = "mice_id",
                     "predicted.ann_finest_level" = "cell_type"))

# Add edited metadata back to Seurat object.
seurat_cluster@meta.data <- metadata

# Factorize the timepoint according to sequential order.
seurat_cluster$timepoint <- factor(x = seurat_cluster$timepoint,
                                   levels = c("Sham", "Day 14", "Day 35"))

# Create a column in metadata slot that holds the treatment and timepoint information.
seurat_cluster$comparison_var = ifelse(seurat_cluster$timepoint == "Sham",
                                       "Sham", 
                                       paste(seurat_cluster$treatment,                     
                                             seurat_cluster$timepoint,  
                                             sep = "_"))

# Factorize the comparison_var according to sequential order.
seurat_cluster$comparison_var <- factor(x = seurat_cluster$comparison_var,
                                        levels = c("Sham", 
                                                   "Bleomycin_Day 14", 
                                                   "Bleomycin_Day 35"))

# Set default idents of Seurat object.
Idents(seurat_cluster) <- "cell_type"

# Remove unneeded object.
rm(seurat_annotated, metadata)

Step 5.6: Final cell type annotation

# Set colour scheme based on annotated cell type.
celltype_col = c("AT1" = "#2f4f4f",
                 "AT2" = "#00bfff",
                 "Basal resting" = "#2e8b57",
                 "Club (non-nasal)" = "#8b008b",
                 "Endothelial arterial" = "#ffa500",
                 "Endothelial capillary" = "#808000",
                 "Endothelial lympathic" = "#00008b",
                 "Endothelial venous" = "#00ff00",
                 "Fibroblasts" = "#ff4500",
                 "Immune" = "#f08080",
                 "Ionocyte" = "#f0e68c",
                 "Multiciliated (non-nasal)" = "#ff00ff",
                 "Myofibroblasts" = "#c71585",
                 "Neuroendocrine" = "#00fa9a",
                 "Pericytes" = "#ff1493",
                 "Suprabasal" = "#eee8aa",
                 "Transitional Club-AT2" = "#9370db",
                 "VSMCs" = "#00ffff")

# Visualize annotated cluster with a UMAP.
DimPlot(seurat_cluster,
        reduction = "umap",
        group.by = "cell_type",
        cols = celltype_col,
        label = TRUE,
        label.size = 4,
        repel = TRUE,
        raster = FALSE) + 
  ggtitle("Final cell type annotation") +
  theme(plot.title = element_text(hjust = 0.5)) 

Step 6: Save Seurat objects.

save(seurat_cluster, 
     file = paste0(supp_file_path, "data/preprocessed_seurat_obj.RData"))

References

Analysis codes are adapted from HBCtraining and Ouyang Lab with credits going to the original authors of the publications cited in the book. Reference materials are adapted from SVI Bioinformatics and Cellular Genomics Lab with credits going to the original authors.